example.ipynb
InsightfulPy
Data¶
In [1]:
import pandas as pd
# Original data: semicolon-delimited CSV, expected in the notebook's working directory.
# The earlier literal "\bank-additional-full.csv" began with "\b", which Python reads
# as a backspace character (U+0008) rather than a backslash followed by "b", so the
# path did not name the file. A plain relative filename avoids the escape entirely.
data = pd.read_csv("bank-additional-full.csv", sep=';')
In [2]:
# Display settings applied in order: no cap on shown columns, up to 1000 rows,
# and no truncation of cell contents.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 1000),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)
In [3]:
# Print the frame's schema: column dtypes, non-null counts and memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 41188 entries, 0 to 41187 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 41188 non-null int64 1 job 41188 non-null object 2 marital 41188 non-null object 3 education 41188 non-null object 4 default 41188 non-null object 5 housing 41188 non-null object 6 loan 41188 non-null object 7 contact 41188 non-null object 8 month 41188 non-null object 9 day_of_week 41188 non-null object 10 duration 41188 non-null int64 11 campaign 41188 non-null int64 12 pdays 41188 non-null int64 13 previous 41188 non-null int64 14 poutcome 41188 non-null object 15 emp.var.rate 41188 non-null float64 16 cons.price.idx 41188 non-null float64 17 cons.conf.idx 41188 non-null float64 18 euribor3m 41188 non-null float64 19 nr.employed 41188 non-null float64 20 y 41188 non-null object dtypes: float64(5), int64(5), object(11) memory usage: 6.6+ MB
In [4]:
# Preview the first rows, transposed so that every column is shown as a row.
data.head().T
Out[4]:
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| age | 56 | 57 | 37 | 40 | 56 |
| job | housemaid | services | services | admin. | services |
| marital | married | married | married | married | married |
| education | basic.4y | high.school | high.school | basic.6y | high.school |
| default | no | unknown | no | no | no |
| housing | no | no | yes | no | no |
| loan | no | no | no | no | yes |
| contact | telephone | telephone | telephone | telephone | telephone |
| month | may | may | may | may | may |
| day_of_week | mon | mon | mon | mon | mon |
| duration | 261 | 149 | 226 | 151 | 307 |
| campaign | 1 | 1 | 1 | 1 | 1 |
| pdays | 999 | 999 | 999 | 999 | 999 |
| previous | 0 | 0 | 0 | 0 | 0 |
| poutcome | nonexistent | nonexistent | nonexistent | nonexistent | nonexistent |
| emp.var.rate | 1.1 | 1.1 | 1.1 | 1.1 | 1.1 |
| cons.price.idx | 93.994 | 93.994 | 93.994 | 93.994 | 93.994 |
| cons.conf.idx | -36.4 | -36.4 | -36.4 | -36.4 | -36.4 |
| euribor3m | 4.857 | 4.857 | 4.857 | 4.857 | 4.857 |
| nr.employed | 5191.0 | 5191.0 | 5191.0 | 5191.0 | 5191.0 |
| y | no | no | no | no | no |
Imports & available functions¶
In [5]:
import pandas as pd
from InsightfulPy.eda import *
from InsightfulPy.utils import *
In [6]:
import inspect
from InsightfulPy import eda, utils
# Collect the plain functions exposed by each module, then print both inventories
# as numbered lists (numbering starts at 1 for each module).
eda_functions = inspect.getmembers(eda, inspect.isfunction)
utils_functions = inspect.getmembers(utils, inspect.isfunction)

for heading, members in (
    ("EDA Functions:", eda_functions),
    ("\nUtils Functions:", utils_functions),
):
    print(heading)
    for idx, (func_name, _) in enumerate(members, start=1):
        print(f"{idx}. {func_name}")
EDA Functions: 1. analyze_data 2. box_plot_batches 3. calc_stats 4. calculate_skewness_kurtosis 5. cat_analyze_and_plot 6. cat_bar_batches 7. cat_high_cardinality 8. cat_pie_chart_batches 9. cat_summary 10. cat_vs_cat_pair_batch 11. columns_info 12. detect_mixed_data_types 13. detect_outliers 14. grouped_summary 15. iqr_trimmed_mean 16. is_datetime64_any_dtype 17. kde_batches 18. mad 19. num_analysis_and_plot 20. num_summary 21. num_vs_cat_box_violin_pair_batch 22. num_vs_num_scatterplot_pair_batch 23. plot_boxplots 24. print_columns 25. qq_plot_batches 26. show_missing 27. tabulate Utils Functions: 1. calc_stats 2. iqr_trimmed_mean 3. mad 4. tabulate
In [7]:
# Check `data` for mixed data types; in the recorded run the helper returned a
# plain status string.
detect_mixed_data_types(data)
Out[7]:
'No mixed data types detected!'
In [8]:
# Presumably reports categorical columns with many distinct values — TODO confirm.
# NOTE(review): `high_cardinality_columns` is not assigned in any visible cell, so
# it presumably comes from a star import or from leftover kernel state. Confirm it
# survives Restart & Run All, and consider binding the helper's return value
# explicitly if it returns the list.
cat_high_cardinality(data)
high_cardinality_columns
Out[8]:
[]
In [9]:
# Summary statistics for `age`; the recorded output is a dict keyed by statistic
# name (Count, Mean, Trimmed Mean, MAD, Std, quartiles, Skewness, Kurtosis, ...).
calc_stats(data['age'])
Out[9]:
{'Count': np.int64(41188),
'Mean': np.float64(40.02406040594348),
'Trimmed Mean': np.float64(39.599007834180604),
'MAD': np.float64(8.461535773937293),
'Std': np.float64(10.421249980934048),
'Min': np.int64(17),
'25%': np.float64(32.0),
'50%': np.float64(38.0),
'75%': np.float64(47.0),
'Max': np.int64(98),
'Mode': np.int64(31),
'Range': np.int64(81),
'IQR': np.float64(15.0),
'Variance': np.float64(108.60245116511788),
'Skewness': np.float64(0.7846968157646645),
'Kurtosis': np.float64(0.7913115311544336)}
In [10]:
# Summary statistics for `duration`; the recorded output is a dict keyed by
# statistic name (Count, Mean, Trimmed Mean, MAD, Std, quartiles, ...).
calc_stats(data['duration'])
Out[10]:
{'Count': np.int64(41188),
'Mean': np.float64(258.2850101971448),
'Trimmed Mean': np.float64(203.25483322432962),
'MAD': np.float64(171.66613264329496),
'Std': np.float64(259.2792488364648),
'Min': np.int64(0),
'25%': np.float64(102.0),
'50%': np.float64(180.0),
'75%': np.float64(319.0),
'Max': np.int64(4918),
'Mode': np.int64(85),
'Range': np.int64(4918),
'IQR': np.float64(217.0),
'Variance': np.float64(67225.72887720143),
'Skewness': np.float64(3.263141255262832),
'Kurtosis': np.float64(20.247938014978796)}
Data Analysis and Cleaning¶
Dataset Overview¶
In [11]:
# Print a per-column overview under the given title; the recorded output lists
# the data type, value range and distinct count of each column.
columns_info("Dataset Overview", data)
======== Dataset Overview: =========== Index Col Index Attribute Data Type Range Distinct Count ----- ---------- ------------------------------ --------------- ------------------------------ --------------- 1 15 emp.var.rate float64 -3.4 - 1.4 10 2 16 cons.price.idx float64 92.201 - 94.767 26 3 17 cons.conf.idx float64 -50.8 - -26.9 26 4 18 euribor3m float64 0.634 - 5.045 316 5 19 nr.employed float64 4963.6 - 5228.1 11 6 0 age int64 17 - 98 78 7 10 duration int64 0 - 4918 1544 8 11 campaign int64 1 - 56 42 9 12 pdays int64 0 - 999 27 10 13 previous int64 0 - 7 8 11 1 job object N/A 12 12 2 marital object N/A 4 13 3 education object N/A 8 14 4 default object N/A 3 15 5 housing object N/A 3 16 6 loan object N/A 3 17 7 contact object N/A 2 18 8 month object N/A 10 19 9 day_of_week object N/A 5 20 14 poutcome object N/A 3 21 20 y object N/A 2
In [12]:
# Print summary tables: the recorded run shows mean/SD/SE with a 95% confidence
# interval for numeric columns and counts with percentages for categorical ones.
analyze_data(data)
=== Numerical Analysis === | | Variable | N | Mean | SD | SE | 95% Conf. | Interval | |---:|:---------------|------:|----------:|---------:|-------:|------------:|-----------:| | 0 | age | 41188 | 40.0241 | 10.4212 | 0.0513 | 39.9234 | 40.1247 | | 1 | duration | 41188 | 258.285 | 259.279 | 1.2776 | 255.781 | 260.789 | | 2 | campaign | 41188 | 2.5676 | 2.77 | 0.0136 | 2.5408 | 2.5943 | | 3 | pdays | 41188 | 962.476 | 186.911 | 0.921 | 960.67 | 964.281 | | 4 | previous | 41188 | 0.173 | 0.4949 | 0.0024 | 0.1682 | 0.1777 | | 5 | emp.var.rate | 41188 | 0.0819 | 1.571 | 0.0077 | 0.0667 | 0.0971 | | 6 | cons.price.idx | 41188 | 93.5757 | 0.5788 | 0.0029 | 93.5701 | 93.5813 | | 7 | cons.conf.idx | 41188 | -40.5026 | 4.6282 | 0.0228 | -40.5473 | -40.4579 | | 8 | euribor3m | 41188 | 3.6213 | 1.7344 | 0.0085 | 3.6045 | 3.638 | | 9 | nr.employed | 41188 | 5167.04 | 72.2515 | 0.356 | 5166.34 | 5167.73 | === Categorical Analysis === | | Variable | Outcome | Count | Percent | |---:|:------------|:--------------------|--------:|----------:| | 0 | job | admin. 
| 10422 | 25.3 | | 1 | job | blue-collar | 9254 | 22.47 | | 2 | job | technician | 6743 | 16.37 | | 3 | job | services | 3969 | 9.64 | | 4 | job | management | 2924 | 7.1 | | 5 | job | retired | 1720 | 4.18 | | 6 | job | entrepreneur | 1456 | 3.54 | | 7 | job | self-employed | 1421 | 3.45 | | 8 | job | housemaid | 1060 | 2.57 | | 9 | job | unemployed | 1014 | 2.46 | | 10 | job | student | 875 | 2.12 | | 11 | job | unknown | 330 | 0.8 | | 12 | marital | married | 24928 | 60.52 | | 13 | marital | single | 11568 | 28.09 | | 14 | marital | divorced | 4612 | 11.2 | | 15 | marital | unknown | 80 | 0.19 | | 16 | education | university.degree | 12168 | 29.54 | | 17 | education | high.school | 9515 | 23.1 | | 18 | education | basic.9y | 6045 | 14.68 | | 19 | education | professional.course | 5243 | 12.73 | | 20 | education | basic.4y | 4176 | 10.14 | | 21 | education | basic.6y | 2292 | 5.56 | | 22 | education | unknown | 1731 | 4.2 | | 23 | education | illiterate | 18 | 0.04 | | 24 | default | no | 32588 | 79.12 | | 25 | default | unknown | 8597 | 20.87 | | 26 | default | yes | 3 | 0.01 | | 27 | housing | yes | 21576 | 52.38 | | 28 | housing | no | 18622 | 45.21 | | 29 | housing | unknown | 990 | 2.4 | | 30 | loan | no | 33950 | 82.43 | | 31 | loan | yes | 6248 | 15.17 | | 32 | loan | unknown | 990 | 2.4 | | 33 | contact | cellular | 26144 | 63.47 | | 34 | contact | telephone | 15044 | 36.53 | | 35 | month | may | 13769 | 33.43 | | 36 | month | jul | 7174 | 17.42 | | 37 | month | aug | 6178 | 15 | | 38 | month | jun | 5318 | 12.91 | | 39 | month | nov | 4101 | 9.96 | | 40 | month | apr | 2632 | 6.39 | | 41 | month | oct | 718 | 1.74 | | 42 | month | sep | 570 | 1.38 | | 43 | month | mar | 546 | 1.33 | | 44 | month | dec | 182 | 0.44 | | 45 | day_of_week | thu | 8623 | 20.94 | | 46 | day_of_week | mon | 8514 | 20.67 | | 47 | day_of_week | wed | 8134 | 19.75 | | 48 | day_of_week | tue | 8090 | 19.64 | | 49 | day_of_week | fri | 7827 | 19 | | 50 | poutcome | nonexistent | 
35563 | 86.34 | | 51 | poutcome | failure | 4252 | 10.32 | | 52 | poutcome | success | 1373 | 3.33 | | 53 | y | no | 36548 | 88.73 | | 54 | y | yes | 4640 | 11.27 |
In [13]:
# Summary table grouped by the target column `y`; the recorded output shows
# overall and per-group figures together with a P-Value column.
grouped_summary(data,groupby="y")
=== TableOne Summary Grouped by 'y' ===
Out[13]:
| Grouped by y | ||||||
|---|---|---|---|---|---|---|
| Missing | Overall | no | yes | P-Value | ||
| n | 41188 | 36548 | 4640 | |||
| age, mean (SD) | 0 | 40.0 (10.4) | 39.9 (9.9) | 40.9 (13.8) | <0.001 | |
| job, n (%) | admin. | 10422 (25.3) | 9070 (24.8) | 1352 (29.1) | <0.001 | |
| blue-collar | 9254 (22.5) | 8616 (23.6) | 638 (13.8) | |||
| entrepreneur | 1456 (3.5) | 1332 (3.6) | 124 (2.7) | |||
| housemaid | 1060 (2.6) | 954 (2.6) | 106 (2.3) | |||
| management | 2924 (7.1) | 2596 (7.1) | 328 (7.1) | |||
| retired | 1720 (4.2) | 1286 (3.5) | 434 (9.4) | |||
| self-employed | 1421 (3.5) | 1272 (3.5) | 149 (3.2) | |||
| services | 3969 (9.6) | 3646 (10.0) | 323 (7.0) | |||
| student | 875 (2.1) | 600 (1.6) | 275 (5.9) | |||
| technician | 6743 (16.4) | 6013 (16.5) | 730 (15.7) | |||
| unemployed | 1014 (2.5) | 870 (2.4) | 144 (3.1) | |||
| unknown | 330 (0.8) | 293 (0.8) | 37 (0.8) | |||
| marital, n (%) | divorced | 4612 (11.2) | 4136 (11.3) | 476 (10.3) | <0.001 | |
| married | 24928 (60.5) | 22396 (61.3) | 2532 (54.6) | |||
| single | 11568 (28.1) | 9948 (27.2) | 1620 (34.9) | |||
| unknown | 80 (0.2) | 68 (0.2) | 12 (0.3) | |||
| education, n (%) | basic.4y | 4176 (10.1) | 3748 (10.3) | 428 (9.2) | <0.001 | |
| basic.6y | 2292 (5.6) | 2104 (5.8) | 188 (4.1) | |||
| basic.9y | 6045 (14.7) | 5572 (15.2) | 473 (10.2) | |||
| high.school | 9515 (23.1) | 8484 (23.2) | 1031 (22.2) | |||
| illiterate | 18 (0.0) | 14 (0.0) | 4 (0.1) | |||
| professional.course | 5243 (12.7) | 4648 (12.7) | 595 (12.8) | |||
| university.degree | 12168 (29.5) | 10498 (28.7) | 1670 (36.0) | |||
| unknown | 1731 (4.2) | 1480 (4.0) | 251 (5.4) | |||
| default, n (%) | no | 32588 (79.1) | 28391 (77.7) | 4197 (90.5) | <0.001 | |
| unknown | 8597 (20.9) | 8154 (22.3) | 443 (9.5) | |||
| yes | 3 (0.0) | 3 (0.0) | ||||
| housing, n (%) | no | 18622 (45.2) | 16596 (45.4) | 2026 (43.7) | 0.058 | |
| unknown | 990 (2.4) | 883 (2.4) | 107 (2.3) | |||
| yes | 21576 (52.4) | 19069 (52.2) | 2507 (54.0) | |||
| loan, n (%) | no | 33950 (82.4) | 30100 (82.4) | 3850 (83.0) | 0.579 | |
| unknown | 990 (2.4) | 883 (2.4) | 107 (2.3) | |||
| yes | 6248 (15.2) | 5565 (15.2) | 683 (14.7) | |||
| contact, n (%) | cellular | 26144 (63.5) | 22291 (61.0) | 3853 (83.0) | <0.001 | |
| telephone | 15044 (36.5) | 14257 (39.0) | 787 (17.0) | |||
| month, n (%) | apr | 2632 (6.4) | 2093 (5.7) | 539 (11.6) | <0.001 | |
| aug | 6178 (15.0) | 5523 (15.1) | 655 (14.1) | |||
| dec | 182 (0.4) | 93 (0.3) | 89 (1.9) | |||
| jul | 7174 (17.4) | 6525 (17.9) | 649 (14.0) | |||
| jun | 5318 (12.9) | 4759 (13.0) | 559 (12.0) | |||
| mar | 546 (1.3) | 270 (0.7) | 276 (5.9) | |||
| may | 13769 (33.4) | 12883 (35.2) | 886 (19.1) | |||
| nov | 4101 (10.0) | 3685 (10.1) | 416 (9.0) | |||
| oct | 718 (1.7) | 403 (1.1) | 315 (6.8) | |||
| sep | 570 (1.4) | 314 (0.9) | 256 (5.5) | |||
| day_of_week, n (%) | fri | 7827 (19.0) | 6981 (19.1) | 846 (18.2) | <0.001 | |
| mon | 8514 (20.7) | 7667 (21.0) | 847 (18.3) | |||
| thu | 8623 (20.9) | 7578 (20.7) | 1045 (22.5) | |||
| tue | 8090 (19.6) | 7137 (19.5) | 953 (20.5) | |||
| wed | 8134 (19.7) | 7185 (19.7) | 949 (20.5) | |||
| duration, mean (SD) | 0 | 258.3 (259.3) | 220.8 (207.1) | 553.2 (401.2) | <0.001 | |
| campaign, mean (SD) | 0 | 2.6 (2.8) | 2.6 (2.9) | 2.1 (1.7) | <0.001 | |
| pdays, mean (SD) | 0 | 962.5 (186.9) | 984.1 (120.7) | 792.0 (403.4) | <0.001 | |
| previous, mean (SD) | 0 | 0.2 (0.5) | 0.1 (0.4) | 0.5 (0.9) | <0.001 | |
| poutcome, n (%) | failure | 4252 (10.3) | 3647 (10.0) | 605 (13.0) | <0.001 | |
| nonexistent | 35563 (86.3) | 32422 (88.7) | 3141 (67.7) | |||
| success | 1373 (3.3) | 479 (1.3) | 894 (19.3) | |||
| emp.var.rate, mean (SD) | 0 | 0.1 (1.6) | 0.2 (1.5) | -1.2 (1.6) | <0.001 | |
| cons.price.idx, mean (SD) | 0 | 93.6 (0.6) | 93.6 (0.6) | 93.4 (0.7) | <0.001 | |
| cons.conf.idx, mean (SD) | 0 | -40.5 (4.6) | -40.6 (4.4) | -39.8 (6.1) | <0.001 | |
| euribor3m, mean (SD) | 0 | 3.6 (1.7) | 3.8 (1.6) | 2.1 (1.7) | <0.001 | |
| nr.employed, mean (SD) | 0 | 5167.0 (72.3) | 5176.2 (64.6) | 5095.1 (87.6) | <0.001 | |
| y, n (%) | no | 36548 (88.7) | 36548 (100.0) | <0.001 | ||
| yes | 4640 (11.3) | 4640 (100.0) | ||||
In [14]:
# Replace the literal string 'unknown' with a missing-value marker (pd.NA),
# modifying `data` in place.
data.replace('unknown', pd.NA, inplace=True)

# Count missing entries per column once and reuse the result for both reports.
missing_per_column = data.isna().sum()
print(f'Total Missing Percentage: {(missing_per_column.sum() / data.size) * 100:.2f}%')
print("\nStatistical Summary of Missing Values:\n")
print(missing_per_column)
show_missing(data)
Total Missing Percentage: 1.47% Statistical Summary of Missing Values: age 0 job 330 marital 80 education 1731 default 8597 housing 990 loan 990 contact 0 month 0 day_of_week 0 duration 0 campaign 0 pdays 0 previous 0 poutcome 0 emp.var.rate 0 cons.price.idx 0 cons.conf.idx 0 euribor3m 0 nr.employed 0 y 0 dtype: int64
<Figure size 2000x800 with 0 Axes>
In [15]:
# Flag rows that repeat an earlier row, report how many there are, then display them.
duplicate_mask = data.duplicated()
print(f"\nNumber of duplicate rows: {duplicate_mask.sum()}\n")
duplicates = data[duplicate_mask]
duplicates
Number of duplicate rows: 12
Out[15]:
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1266 | 39 | blue-collar | married | basic.6y | no | no | no | telephone | may | thu | 124 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.855 | 5191.0 | no |
| 12261 | 36 | retired | married | <NA> | no | no | no | telephone | jul | thu | 88 | 1 | 999 | 0 | nonexistent | 1.4 | 93.918 | -42.7 | 4.966 | 5228.1 | no |
| 14234 | 27 | technician | single | professional.course | no | no | no | cellular | jul | mon | 331 | 2 | 999 | 0 | nonexistent | 1.4 | 93.918 | -42.7 | 4.962 | 5228.1 | no |
| 16956 | 47 | technician | divorced | high.school | no | yes | no | cellular | jul | thu | 43 | 3 | 999 | 0 | nonexistent | 1.4 | 93.918 | -42.7 | 4.962 | 5228.1 | no |
| 18465 | 32 | technician | single | professional.course | no | yes | no | cellular | jul | thu | 128 | 1 | 999 | 0 | nonexistent | 1.4 | 93.918 | -42.7 | 4.968 | 5228.1 | no |
| 20216 | 55 | services | married | high.school | <NA> | no | no | cellular | aug | mon | 33 | 1 | 999 | 0 | nonexistent | 1.4 | 93.444 | -36.1 | 4.965 | 5228.1 | no |
| 20534 | 41 | technician | married | professional.course | no | yes | no | cellular | aug | tue | 127 | 1 | 999 | 0 | nonexistent | 1.4 | 93.444 | -36.1 | 4.966 | 5228.1 | no |
| 25217 | 39 | admin. | married | university.degree | no | no | no | cellular | nov | tue | 123 | 2 | 999 | 0 | nonexistent | -0.1 | 93.200 | -42.0 | 4.153 | 5195.8 | no |
| 28477 | 24 | services | single | high.school | no | yes | no | cellular | apr | tue | 114 | 1 | 999 | 0 | nonexistent | -1.8 | 93.075 | -47.1 | 1.423 | 5099.1 | no |
| 32516 | 35 | admin. | married | university.degree | no | yes | no | cellular | may | fri | 348 | 4 | 999 | 0 | nonexistent | -1.8 | 92.893 | -46.2 | 1.313 | 5099.1 | no |
| 36951 | 45 | admin. | married | university.degree | no | no | no | cellular | jul | thu | 252 | 1 | 999 | 0 | nonexistent | -2.9 | 92.469 | -33.6 | 1.072 | 5076.2 | yes |
| 38281 | 71 | retired | single | university.degree | no | no | no | telephone | oct | tue | 120 | 1 | 999 | 0 | nonexistent | -3.4 | 92.431 | -26.9 | 0.742 | 5017.5 | no |
In [16]:
# Mode imputation: fill missing values in each listed categorical column with that
# column's most frequent value.
# The result is assigned back to the column rather than calling
# `data[col].fillna(..., inplace=True)`: that form is chained in-place assignment,
# which pandas warns about and which does not update `data` under Copy-on-Write.
for col in ['job', 'marital', 'education', 'default', 'housing', 'loan']:
    data[col] = data[col].fillna(data[col].mode()[0])

# Drop exact duplicate rows. This runs after imputation, so rows that only became
# identical once their missing values were filled are removed as well.
data.drop_duplicates(inplace=True)

# Confirm that no missing values and no duplicate rows remain.
print(data.isna().sum())
print(f"\nNumber of duplicate rows after removal: {data.duplicated().sum()}\n")
age 0 job 0 marital 0 education 0 default 0 housing 0 loan 0 contact 0 month 0 day_of_week 0 duration 0 campaign 0 pdays 0 previous 0 poutcome 0 emp.var.rate 0 cons.price.idx 0 cons.conf.idx 0 euribor3m 0 nr.employed 0 y 0 dtype: int64 Number of duplicate rows after removal: 0
In [17]:
# Value counts and percentages for the target column `y` (see the recorded table),
# called with visualize=True and subplot=False.
cat_analyze_and_plot(data, "y", visualize=True, subplot=False)
Value counts and percentages for Y: | Y | Count | % Total | |:----|--------:|----------:| | no | 36535 | 88.73 | | yes | 4639 | 11.27 |
Exploratory Data Analysis (EDA)¶
In [18]:
# Cross-tabulate `job` against `y`. With return_df=True the call's result is the
# table shown as the cell output; show_table=False presumably suppresses the
# helper's own printed table — TODO confirm.
cat_analyze_and_plot(data, "job","y", show_table=False,return_df=True)
Out[18]:
| Job | no | yes | Count | % Total | % no | % yes | |
|---|---|---|---|---|---|---|---|
| 0 | admin. | 9360 | 1388 | 10748 | 26.10 | 87.09 | 12.91 |
| 1 | blue-collar | 8614 | 638 | 9252 | 22.47 | 93.10 | 6.90 |
| 9 | technician | 6009 | 730 | 6739 | 16.37 | 89.17 | 10.83 |
| 7 | services | 3644 | 323 | 3967 | 9.63 | 91.86 | 8.14 |
| 4 | management | 2596 | 328 | 2924 | 7.10 | 88.78 | 11.22 |
| 5 | retired | 1284 | 434 | 1718 | 4.17 | 74.74 | 25.26 |
| 2 | entrepreneur | 1332 | 124 | 1456 | 3.54 | 91.48 | 8.52 |
| 6 | self-employed | 1272 | 149 | 1421 | 3.45 | 89.51 | 10.49 |
| 3 | housemaid | 954 | 106 | 1060 | 2.57 | 90.00 | 10.00 |
| 10 | unemployed | 870 | 144 | 1014 | 2.46 | 85.80 | 14.20 |
| 8 | student | 600 | 275 | 875 | 2.13 | 68.57 | 31.43 |
In [19]:
# Statistics for `age`, overall and split by the values of `y` (see the recorded
# table). subplot=True is passed through to the helper's plotting option.
num_analysis_and_plot(data, "age", "y", subplot=True)
### Analysis for 'age' by y ### | | Age | Overall | y: no | y: yes | |---:|:-------------|-------------:|-------------:|------------:| | 0 | Count | 41174 | 36535 | 4639 | | 1 | Mean | 40.0236 | 39.9107 | 40.9123 | | 2 | Trimmed Mean | 39.5992 | 39.6618 | 40.1271 | | 3 | MAD | 8.46118 | 8.13008 | 11.1292 | | 4 | Std | 10.4206 | 9.89702 | 13.8388 | | 5 | Min | 17 | 17 | 17 | | 6 | 25% | 32 | 32 | 31 | | 7 | 50% | 38 | 38 | 37 | | 8 | 75% | 47 | 47 | 50 | | 9 | Max | 98 | 95 | 98 | | 10 | Mode | 31 | 31 | 31 | | 11 | Range | 81 | 78 | 81 | | 12 | IQR | 15 | 15 | 19 | | 13 | Variance | 108.588 | 97.9511 | 191.513 | | 14 | Skewness | 0.784604 | 0.652591 | 0.999532 | | 15 | Kurtosis | 0.791372 | 0.363456 | 0.670676 |
Pair_Batch¶
In [20]:
# Called without batch_num: the recorded output is a table of batch numbers and
# the categorical columns assigned to each batch.
cat_bar_batches(data)
Out[20]:
| batch_num | batch_columns | |
|---|---|---|
| 0 | 1 | [job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome, y] |
In [21]:
# Called without batch_num: the recorded output is a table of batch numbers and
# the categorical columns assigned to each batch.
cat_pie_chart_batches(data)
Out[21]:
| batch_num | batch_columns | |
|---|---|---|
| 0 | 1 | [job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome, y] |
In [22]:
# Called without batch_num: the recorded output is a table of batch numbers and
# the numeric columns assigned to each batch.
kde_batches(data)
Out[22]:
| Batch Number | Columns | |
|---|---|---|
| 0 | 1 | [age, duration, campaign, pdays, previous, emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed] |
In [23]:
# Called without batch_num: the recorded output is a table of batch numbers and
# the numeric columns assigned to each batch.
box_plot_batches(data)
Out[23]:
| Batch Number | Columns | |
|---|---|---|
| 0 | 1 | [age, duration, campaign, pdays, previous, emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed] |
In [24]:
# Called without batch_num: the recorded output is a table of batch numbers and
# the numeric columns assigned to each batch.
qq_plot_batches(data)
Out[24]:
| Batch Number | Columns | |
|---|---|---|
| 0 | 1 | [age, duration, campaign, pdays, previous, emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed] |
In [25]:
# Called without pair_num/batch_num: the recorded output is an index table that
# maps each numeric pair column to the batch of other numeric columns.
num_vs_num_scatterplot_pair_batch(data)
Out[25]:
| Pair_Num | Pair_Column | Batch_Num | Batch_Columns | |
|---|---|---|---|---|
| 0 | 0 | age | 1 | [duration, campaign, pdays, previous, emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed] |
| 1 | 1 | duration | 1 | [age, campaign, pdays, previous, emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed] |
| 2 | 2 | campaign | 1 | [age, duration, pdays, previous, emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed] |
| 3 | 3 | pdays | 1 | [age, duration, campaign, previous, emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed] |
| 4 | 4 | previous | 1 | [age, duration, campaign, pdays, emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed] |
| 5 | 5 | emp.var.rate | 1 | [age, duration, campaign, pdays, previous, cons.price.idx, cons.conf.idx, euribor3m, nr.employed] |
| 6 | 6 | cons.price.idx | 1 | [age, duration, campaign, pdays, previous, emp.var.rate, cons.conf.idx, euribor3m, nr.employed] |
| 7 | 7 | cons.conf.idx | 1 | [age, duration, campaign, pdays, previous, emp.var.rate, cons.price.idx, euribor3m, nr.employed] |
| 8 | 8 | euribor3m | 1 | [age, duration, campaign, pdays, previous, emp.var.rate, cons.price.idx, cons.conf.idx, nr.employed] |
| 9 | 9 | nr.employed | 1 | [age, duration, campaign, pdays, previous, emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m] |
In [26]:
# Called without pair_num/batch_num: the recorded output is an index table that
# maps each categorical pair column (with its unique-value counts) to the batch
# of other categorical columns.
cat_vs_cat_pair_batch(data)
Out[26]:
| Pair_Num | Pair_Column | Original_Unique | plot_Unique | Batch_Num | Batch_Columns | |
|---|---|---|---|---|---|---|
| 0 | 0 | job | 11 | 11 | 1 | [marital, education, default, housing, loan, contact, month, day_of_week, poutcome, y] |
| 1 | 1 | marital | 3 | 3 | 1 | [job, education, default, housing, loan, contact, month, day_of_week, poutcome, y] |
| 2 | 2 | education | 7 | 7 | 1 | [job, marital, default, housing, loan, contact, month, day_of_week, poutcome, y] |
| 3 | 3 | default | 2 | 2 | 1 | [job, marital, education, housing, loan, contact, month, day_of_week, poutcome, y] |
| 4 | 4 | housing | 2 | 2 | 1 | [job, marital, education, default, loan, contact, month, day_of_week, poutcome, y] |
| 5 | 5 | loan | 2 | 2 | 1 | [job, marital, education, default, housing, contact, month, day_of_week, poutcome, y] |
| 6 | 6 | contact | 2 | 2 | 1 | [job, marital, education, default, housing, loan, month, day_of_week, poutcome, y] |
| 7 | 7 | month | 10 | 10 | 1 | [job, marital, education, default, housing, loan, contact, day_of_week, poutcome, y] |
| 8 | 8 | day_of_week | 5 | 5 | 1 | [job, marital, education, default, housing, loan, contact, month, poutcome, y] |
| 9 | 9 | poutcome | 3 | 3 | 1 | [job, marital, education, default, housing, loan, contact, month, day_of_week, y] |
| 10 | 10 | y | 2 | 2 | 1 | [job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome] |
In [27]:
# Called without pair_num/batch_num: the recorded output is an index table that
# maps each numeric pair column to the batch of categorical columns.
num_vs_cat_box_violin_pair_batch(data)
Out[27]:
| pair_num | pair_column | batch_num | batch_column | |
|---|---|---|---|---|
| 0 | 0 | age | 1 | [job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome, y] |
| 1 | 1 | duration | 1 | [job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome, y] |
| 2 | 2 | campaign | 1 | [job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome, y] |
| 3 | 3 | pdays | 1 | [job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome, y] |
| 4 | 4 | previous | 1 | [job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome, y] |
| 5 | 5 | emp.var.rate | 1 | [job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome, y] |
| 6 | 6 | cons.price.idx | 1 | [job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome, y] |
| 7 | 7 | cons.conf.idx | 1 | [job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome, y] |
| 8 | 8 | euribor3m | 1 | [job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome, y] |
| 9 | 9 | nr.employed | 1 | [job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome, y] |
In [28]:
# Select batch 1 for the categorical bar and pie helpers; presumably this renders
# the charts for that batch rather than the batch index table — TODO confirm.
cat_bar_batches(data, batch_num=1)
cat_pie_chart_batches(data, batch_num=1)
In [29]:
# Select batch 1 for the KDE, box-plot and Q-Q helpers; presumably this renders
# the plots for that batch rather than the batch index table — TODO confirm.
kde_batches(data, batch_num=1)
box_plot_batches(data, batch_num=1)
qq_plot_batches(data, batch_num=1)
In [30]:
# Select pair 0, batch 1 for each pairwise helper. In the index tables recorded
# earlier, pair 0 is `age` for the numeric helpers and `job` for the categorical one.
# The scatterplot call additionally passes hue_column="y".
num_vs_num_scatterplot_pair_batch(data, pair_num=0, batch_num=1,hue_column="y")
cat_vs_cat_pair_batch(data, pair_num=0, batch_num=1)
num_vs_cat_box_violin_pair_batch(data, pair_num=0, batch_num=1)
Preprocessing¶
In [31]:
# Box plots for `data` (presumably its numeric columns — TODO confirm), followed
# by a per-column skewness/kurtosis table, which is the cell's displayed output.
plot_boxplots(data)
calculate_skewness_kurtosis(data)
Out[31]:
| Skewness | Kurtosis | |
|---|---|---|
| age | 0.784604 | 0.791372 |
| duration | 3.262748 | 20.243157 |
| campaign | 4.761966 | 36.970631 |
| pdays | -4.921252 | 22.220234 |
| previous | 3.831288 | 20.101055 |
| emp.var.rate | -0.723985 | -1.062807 |
| cons.price.idx | -0.230825 | -0.829914 |
| cons.conf.idx | 0.302983 | -0.358989 |
| euribor3m | -0.709117 | -1.406900 |
| nr.employed | -1.044253 | -0.003697 |
In [32]:
# Per-column outlier report. In the recorded output the bounds equal
# Q1 - 1.5*IQR and Q3 + 1.5*IQR (e.g. age: 32 - 1.5*15 = 9.5, 47 + 1.5*15 = 69.5).
# Columns with IQR 0 (pdays, previous) therefore flag every value that differs
# from the quartile value.
detect_outliers(data)
Out[32]:
| Column | Q1 | Q3 | IQR | Lower Bound | Upper Bound | Total Distinct | Outliers Distinct | Outliers Count | Outliers % | Outliers (First 10) | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | age | 32.0 | 47.0 | 15.0 | 9.50 | 69.50 | 78 | 25 | 468 | 1.14% | 70, 71, 72, 73, 74, 75, 76, 77, 78, 79... |
| 1 | duration | 102.0 | 319.0 | 217.0 | -223.50 | 644.50 | 1544 | 899 | 2963 | 7.2% | 645, 646, 647, 648, 649, 650, 651, 652, 653, 654... |
| 2 | campaign | 1.0 | 3.0 | 2.0 | -2.00 | 6.00 | 42 | 36 | 2406 | 5.84% | 7, 8, 9, 10, 11, 12, 13, 14, 15, 16... |
| 3 | pdays | 999.0 | 999.0 | 0.0 | 999.00 | 999.00 | 27 | 26 | 1515 | 3.68% | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9... |
| 4 | previous | 0.0 | 0.0 | 0.0 | 0.00 | 0.00 | 8 | 7 | 5625 | 13.66% | 1, 2, 3, 4, 5, 6, 7 |
| 5 | cons.conf.idx | -42.7 | -36.4 | 6.3 | -52.15 | -26.95 | 26 | 1 | 446 | 1.08% | -26.9 |
In [ ]: